This script reproduces all plots for Figure 1 and associated supplementary figures in PaperName. Preprocessing of raw RNA sequencing data is done in ./Folder/script, that yields allele-specific expression counts which are combined into a SingleCellExperiment (SCE) object in ./Folder/script.R. [Insert in Figure 4: CnR + methylation data preprocessing]. We first load the necessary libraries and some helper functions:
setwd("~/Desktop/Projects/XChromosome_Antonia/")
library(tidyverse)
library(ggplot2)
library(scran)
library(scater)
library(DESeq2)
library(patchwork)
source("./Scripts/auxiliary.R")
# General theme for all plots:
theme_paper <- function(){
theme(panel.background = element_blank(),
panel.grid = element_line(colour= "grey92"),
panel.grid.minor = element_line(size = rel(0.5)),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
text = element_text(size = 20))
}
escape_colors <- setNames(c("grey", "darkgreen", "orange"), nm = c("silenced / variable", "facultative", "constitutive"))
Now, read in the SCE, remove non-X genes and exclude lowly expressed genes
data <- readRDS("./ProcessedData/merged_dataset_paper.rds")
data_before_filtering <- data[seqnames(data) == "X", ]
data <- data[(rowSums(counts_inactive(data)) + rowSums(counts_active(data))) / ncol(data) > 10, ]
data_with_autosomes <- data
data <- data[seqnames(data) == "X", ]
For the supplement, generate plots showing a) how many genes we detect with sufficient allele-specific coverage and b) allelic densities of autosomal and X-linked genes. Finally, we look at escape measured by d-scores (d = inact / (inact + act) = B6 / (B6 / CAST)) in two cell lines.
data_baseline_x <- data[,data$Condition == "Aux_0_Dox_0_WO_0_WOAuxNO"]
# Show for one C30 example:
sample_show = 1
colnames(data_before_filtering)[[sample_show]]
## [1] "CL30.NoDoxNoAux"
data.frame(
counts_total = counts(data_before_filtering[,sample_show])[[1]],
counts_allelic = (counts_active(data_before_filtering[,sample_show]) + counts_inactive(data_before_filtering[,sample_show]))[[1]]
) %>%
ggplot(aes(x = counts_total + 1, y = counts_allelic + 1)) + geom_point() + scale_x_log10() + scale_y_log10() +
theme_paper() + geom_vline(xintercept = 10, linetype = "dashed") + xlab("Total reads mapped") + ylab("Allele-specific reads mapped") +
geom_abline(linetype = "dashed")
ggsave("./FiguresIllustrator/FigS1/mapped_reads_c30.pdf")
data.frame(
chromosome = as.character(seqnames(rowRanges(data_with_autosomes))) == "X",
total = (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show]))[[1]],
ase = (counts_active(data_with_autosomes[,sample_show]) / (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show])))[[1]]
) %>%
ggplot(aes(x = ase, fill = chromosome)) + geom_density() + theme_paper() + ylab("Density") + xlab("B6 / (B6 + CAST)")
ggsave("./FiguresIllustrator/FigS1/density_ase_c30.pdf")
# Show for one E6 example:
sample_show = 31
colnames(data_before_filtering)[[sample_show]]
## [1] "C.E6.NoDox"
data.frame(
counts_total = counts(data_before_filtering[,sample_show])[[1]],
counts_allelic = (counts_active(data_before_filtering[,sample_show]) + counts_inactive(data_before_filtering[,sample_show]))[[1]]
) %>%
ggplot(aes(x = counts_total + 1, y = counts_allelic + 1)) + geom_point() + scale_x_log10() + scale_y_log10() +
theme_paper() + geom_vline(xintercept = 10, linetype = "dashed") + xlab("Total reads mapped") + ylab("Allele-specific reads mapped") +
geom_abline(linetype = "dashed")
ggsave("./FiguresIllustrator/FigS1/mapped_reads_e6.pdf")
data.frame(
chromosome = as.character(seqnames(rowRanges(data_with_autosomes))) == "X",
total = (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show]))[[1]],
ase = (counts_active(data_with_autosomes[,sample_show]) / (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show])))[[1]]
) %>%
ggplot(aes(x = ase, fill = chromosome)) + geom_density() + theme_paper() + ylab("Density") + xlab("B6 / (B6 + CAST)")
ggsave("./FiguresIllustrator/FigS1/density_ase_e6.pdf")
# Plot escape in individual clones
sample_nr = 1 # Clone 30
data.frame(
total = rowSums(counts_inactive(data_baseline_x[,sample_nr])) + rowSums(counts_active(data_baseline_x[,sample_nr])),
ase = rowSums(counts_inactive(data_baseline_x[,sample_nr])) / (rowSums(counts_inactive(data_baseline_x[,sample_nr])) +
rowSums(counts_active(data_baseline_x[,sample_nr])))
) %>%
rownames_to_column("Gene") %>%
ggplot(aes(total, ase)) + geom_point() + ggrepel::geom_text_repel(aes(label = Gene)) +
theme_paper() + scale_x_log10() + geom_hline(yintercept = 0.1, linetype = "dashed") + ggtitle("Clone 30") +
xlab("Total expression (allelic reads)") + ylab("ASE (d-score)")
ggsave("./FiguresIllustrator/FigS1/cl30_escape.pdf", width = 12, height = 8)
sample_nr = 3 # E6
data.frame(
total = rowSums(counts_inactive(data_baseline_x[,sample_nr])) + rowSums(counts_active(data_baseline_x[,sample_nr])),
ase = rowSums(counts_inactive(data_baseline_x[,sample_nr])) / (rowSums(counts_inactive(data_baseline_x[,sample_nr])) +
rowSums(counts_active(data_baseline_x[,sample_nr])))
) %>%
rownames_to_column("Gene") %>%
ggplot(aes(total, ase)) + geom_point() + ggrepel::geom_text_repel(aes(label = Gene)) +
theme_paper() + scale_x_log10() + geom_hline(yintercept = 0.1, linetype = "dashed") + ggtitle("Clone 30") +
xlab("Total expression (allelic reads)") + ylab("ASE (d-score)")
ggsave("./FiguresIllustrator/FigS1/e6_escape.pdf", width = 12, height = 8)
We now quantify the number of genes which escape within and across clones and cell lines.
data_here <- data[,data$ConditionClean == "Control"]
# get d-scores across genes + samples
ratios <- counts_inactive(data_baseline_x) / (counts_inactive(data_baseline_x) + counts_active(data_baseline_x))
# Genes with average ASE > 0.7 in the control sample are likely mapping artefacts:
genes_out <- names(rowMeans(ratios, na.rm = T)[rowMeans(ratios, na.rm = T) > 0.8 & rownames(ratios) != "Xist"])
genes_out_na <- rownames( ratios[!apply(ratios, 1, function(x){!any(is.na(x))}), ] )
genes_out <- c(genes_out, genes_out_na)
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
colnames(ratios) <- c("CL30", "CL31", "E6 (rep1)", "E6 (rep2)", "E6 (rep3)", "CL31.16", "CL30.7")
genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show
# Plot d-scores for genes after ordering the genes by chromosomale coordinate
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
dplyr::filter(Condition == "Control") %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
ggplot(aes(Clone, y = reorder(name, position), fill = value)) + geom_tile(width = 0.9) +
theme_paper() + theme(axis.text.x=element_text(angle = 90, hjust = 1)) +
theme(axis.ticks.y = element_blank()) + xlab("") + ylab("Chromosome position") +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "right") + ylab("") + labs(fill = "d-score")
ggsave("./FiguresIllustrator/FigS1/heatmap_ordered.pdf", width = 10, height = 10)
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
dplyr::filter(Condition == "Control") %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
ggplot(aes(Clone, y = reorder(name, position), fill = value)) + geom_tile(width = 0.9) +
theme_paper() + theme(axis.text.x=element_text(angle = 90, hjust = 1)) +
theme(axis.ticks.y = element_blank()) + xlab("") + ylab("Chromosome position") +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "left") + ylab("") + labs(fill = "d-score") + coord_flip()
ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_streched.pdf", width = 30, height = 10)
# Plot d-scores for genes that escape in any sample, excluding genes with > 0.7 d-score
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
ratios <- ratios[order(rowMeans(ratios), decreasing = T), ]
# Now we define the baseline escapees per sample as the genes with d-score > 0.1, this will be used throughout the analysis
escapees <- apply(ratios, 2, function(x){x > 0.1})
escapees_per_clone <- apply(escapees, 2, function(x){rownames(escapees)[x]})
escapees_per_clone <- lapply(escapees_per_clone, function(x){x[x != "Xist"]})
names(escapees_per_clone) <- c("CL30 - December2021", "CL31 - December2021", "E6 - October2021", "E6 - March2022", "E6 - September2022",
"CL30 - October2022", "CL31 - October2022")
# How many do we get per clone and do they overlap?
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
dplyr::filter(Condition == "Control") %>%
dplyr::filter(value > 0.1) %>%
group_by(name) %>%
mutate(escape_in_clones = n() >= 7) %>% # CAVE - here we need to adjust
ggplot(aes(x = Clone, fill = escape_in_clones)) + geom_bar() + theme_paper() +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) + xlab("") + ylab("Number of genes with d-score > 0.1") +
scale_fill_manual(values = c("grey", "darkred"), name = "Escape in all samples")
ggsave("./FiguresIllustrator/FigS1/number_of_escapees_per_clone.pdf", width = 8, height = 8)
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
dplyr::filter(Condition == "Control") %>%
dplyr::filter(value > 0.1) %>%
group_by(name) %>%
mutate(escape_in_clones = n() >= 7) %>%
add_column(escape_group = rowData(data[.$name, ])$EscapeAnnotation) %>%
ggplot(aes(x = Clone, fill = escape_in_clones)) + geom_bar() + theme_paper() +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) + xlab("") + ylab("Number of genes with d-score > 0.1") +
scale_fill_manual(values = c("grey", "darkred"), name = "Escape in all clones") + facet_wrap(~escape_group)
ggsave("./FiguresIllustrator/FigS1/number_of_escapees_per_clone_stratified.pdf", width = 8, height = 8)
For the main figure, we now plot escape during Xist-overexpression (Fig1 a-…)
# Subset data on relevant samples and for the main figure, only show E6
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]
ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
# For Xist-expression, we need a scaling factor per sample to account for differences in sequencing depth
size_factors <- colSums(counts(data_with_autosomes)) / colSums(counts(data_with_autosomes))[[1]]
# check that Xist-overexpression works by plotting Xist counts per million (CPM) per sample
data.frame(
Dox = data_here$ndDox,
Clone = paste0(data_here$Clone, " - ", data_here$Experiment),
Expression = as.numeric(counts(data_here["Xist", ]) / colSums(counts(data_here)) * 1e6)
) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
ggplot(aes(x = Condition, y = Expression)) +
stat_summary(stat = "mean", geom = "bar", fill = "grey", col = "black") +
coord_flip() + theme_paper() +
xlab("") + ylab("Expression (CPM)") + geom_jitter(size = 3, width = 0.1) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 320000)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggsave("./FiguresIllustrator/Fig1/xist_overexpression.pdf", width = 4, height = 4)
# For the next heatmap, we want to highlight a few genes
genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show
# Heatmap showing d-scores of all genes across Xist-OE
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "right") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
scale_x_discrete(labels = genes_of_choice, position = "bottom") +
labs(fill = "d-score") + guides(fill = "none")
ggsave("./FiguresIllustrator/Fig1/heatmap_ordered.pdf", width = 6, height = 4)
# Get escapees that are consistent across E6 samples
e6_escapees <- Reduce("intersect", escapees_per_clone[3:5])
# For the supplement, also plot stratified by replicates:
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
ggplot(aes(reorder(name, position), y = interaction(Clone, Condition, sep = " -- "), group = Clone, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "right") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
scale_x_discrete(labels = genes_of_choice, position = "bottom") +
labs(fill = "d-score")
ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_by_clones.pdf", width = 10, height = 10)
# Now, we show a heatmap with only the escapees
# To structure the data, we order the genes using hierarchical clustering
data_for_clustering <- do.call("rbind", lapply(unique(data_here$ConditionClean), function(x){
if (sum(data_here$ConditionClean == x) == 1){
ratios[e6_escapees, data_here$ConditionClean == x]
} else {
rowMeans(ratios[e6_escapees, data_here$ConditionClean == x])
}
}))
clustering_order <- setNames(hclust(dist(t(data_for_clustering)))$order, rownames(ratios[e6_escapees, ]))
ratios[e6_escapees, ][clustering_order, ] %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
mutate(name = factor(name, levels = unique(.$name))) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) -> to_plot
p1 <- to_plot %>%
ggplot(aes(name, y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
labs(fill = "d-score") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) +
scale_x_discrete(position = "top") + guides(fill = "none")
# Plot escapee annotation at the bottom of this:
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)), Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) +
theme(legend.position = "None")
p1 / p2 + plot_layout(heights = c(4, 0.2))
ggsave("./FiguresIllustrator/Fig1/heatmap_ordered_only_escapees.pdf", width = 20, height = 4)
# Also do this reordering
p1 <- to_plot %>%
ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
labs(fill = "d-score") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) +
scale_x_discrete(position = "top") + guides(fill = "none")
# Plot escapee annotation at the bottom of this:
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)), Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) +
theme(legend.position = "None")
p1 / p2 + plot_layout(heights = c(4, 0.2))
ggsave("./FiguresIllustrator/Fig1/heatmap_ordered_only_escapees_by_position.pdf", width = 20, height = 4)
# Quantify for genes that escape in E6 in the Control sample
ratios[e6_escapees, ] %>%
t() %>% data.frame() %>%
add_column("Dox" = data_here$ndDox) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Dox, Clone)) %>%
group_by(Clone) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
dplyr::filter(!is.na(escape_status)) %>%
ggplot(aes(x = Condition, y = value, fill = escape_status)) +
geom_boxplot(col = "black", outlier.color = "grey") +
theme_paper() + ylab("ASE (d-score)") +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("grey", "darkgreen", "orange"), labels = c("Silenced / Variable", "Facultative", "Constitutive")) +
labs(fill = "Escape Category") + xlab("")
ggsave("./FiguresIllustrator/Fig1/escapee_silencing_boxplots.pdf", width = 9, height = 6)
ratios %>%
t() %>% data.frame() %>%
add_column("Dox" = data_here$ndDox) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Dox, Clone)) %>%
group_by(Clone) %>%
dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
dplyr::filter(!is.na(escape_status)) %>%
ggplot(aes(x = Condition, y = value, fill = Clone)) +
geom_boxplot(col = "black", outlier.color = "grey", position = position_dodge2(preserve = "single")) +
theme_paper() + ylab("ASE (d-score)") +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) +
labs(fill = "Escape Category") + xlab("")
ggsave("./FiguresIllustrator/FigS1/escapee_silencing_boxplots_per_clone.pdf", width = 9, height = 6)
### For supplement, plot the heatmap etc in CL30/CL31
data_here <- data[,data$Condition %in% c("Aux_0_Dox_0_WO_0_WOAuxNO", "Aux_0_Dox_3_WO_0_WOAuxNO", "Aux_0_Dox_7_WO_0_WOAuxNO",
"Aux_0_Dox_14_WO_0_WOAuxNO", "Aux_0_Dox_21_WO_0_WOAuxNO")]
data_here <- data_here[,data_here$Clone != "E6"]
ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
# For Xist-expression, we need a scaling factor per sample to account for differences in seq depth
size_factors <- colSums(counts(data_with_autosomes)) / colSums(counts(data_with_autosomes))[[1]]
# check that Xist-overexpression works
data.frame(
Dox = data_here$ndDox,
Clone = paste0(data_here$Clone, " - ", data_here$Experiment),
Expression = as.numeric(counts(data_here["Xist", ]) / colSums(counts(data_here)) * 1e6)
) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
ggplot(aes(x = Condition, y = Expression)) +
stat_summary(stat = "mean", geom = "bar", fill = "grey", col = "black") +
coord_flip() + theme_paper() +
xlab("") + ylab("Xist Expression (normalized CPM)") + geom_jitter(size = 3, width = 0.1) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 260000))
ggsave("./FiguresIllustrator/FigS1/xist_overexpression_cl30_31.pdf", width = 6, height = 8)
genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "right") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
scale_x_discrete(labels = genes_of_choice, position = "bottom") +
labs(fill = "d-score")
ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_cl30_cl31.pdf", width = 10, height = 10)
ratios %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
#dplyr::filter(value < 0.7) %>%
#dplyr::filter(Clone == "E6") %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
ggplot(aes(reorder(name, position), y = interaction(Clone, Condition, sep = " -- "), group = Clone, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_y_discrete(labels = genes_of_choice, position = "right") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
scale_x_discrete(labels = genes_of_choice, position = "bottom") +
labs(fill = "d-score")
ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_by_clones_cl30_cl31.pdf", width = 10, height = 10)
escapees_cl30 <- Reduce("intersect", escapees_per_clone[-c(3:5)])
ratios[escapees_cl30, ] %>%
t() %>% data.frame() %>%
add_column("Dox" = data_here$ndDox) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Dox, Clone)) %>%
group_by(Clone) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
dplyr::filter(!is.na(escape_status)) %>%
ggplot(aes(x = Condition, y = value, fill = escape_status)) +
geom_boxplot(col = "black", outlier.color = "grey") +
theme_paper() + ylab("ASE (d-score)") +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("grey", "darkgreen", "orange"), labels = c("Silenced / Variable", "Facultative", "Constitutive")) +
labs(fill = "Escape Category") + xlab("")
ggsave("./FiguresIllustrator/Fig1/escapee_silencing_boxplots_cl30_cl31.pdf", width = 9, height = 6)
ratios %>%
t() %>% data.frame() %>%
add_column("Dox" = data_here$ndDox) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Dox, Clone)) %>%
group_by(Clone) %>%
dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
dplyr::filter(!is.na(escape_status)) %>%
ggplot(aes(x = Condition, y = value, fill = Clone)) +
geom_boxplot(col = "black", outlier.color = "grey", position = position_dodge2(preserve = "single")) +
theme_paper() + ylab("ASE (d-score)") +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) +
labs(fill = "Escape Category") + xlab("")
ggsave("./FiguresIllustrator/FigS1/escapee_silencing_boxplots_per_clone_cl30_cl31.pdf", width = 9, height = 6)
############
# To structure the data, we order the genes using hierarchical clustering
data_for_clustering <- do.call("rbind", lapply(unique(data_here$ConditionClean), function(x){
if (sum(data_here$ConditionClean == x) == 1){
ratios[escapees_cl30, data_here$ConditionClean == x]
} else {
rowMeans(ratios[escapees_cl30, data_here$ConditionClean == x])
}
}))
clustering_order <- setNames(hclust(dist(t(data_for_clustering)))$order, rownames(ratios[escapees_cl30, ]))
ratios[escapees_cl30, ][clustering_order, ] %>%
t() %>% data.frame(check.names = F) %>%
add_column("Condition" = data_here$ConditionClean) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Condition, Clone)) %>%
mutate(name = factor(name, levels = unique(.$name))) %>%
add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) -> to_plot
p1 <- to_plot %>%
ggplot(aes(name, y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
labs(fill = "d-score") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) +
scale_x_discrete(position = "top") + guides(fill = "none")
# Plot escapee annotation at the bottom of this:
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)), Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) +
theme(legend.position = "None")
p1 / p2 + plot_layout(heights = c(4, 0.2))
ggsave("./FiguresIllustrator/FigS1/cl30_heatmap_ordered_only_escapees.pdf", width = 20, height = 4)
# Also do this reordering
p1 <- to_plot %>%
ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") +
theme_paper() +
theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) +
xlab("") +
ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) +
labs(fill = "d-score") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) +
scale_x_discrete(position = "top") + guides(fill = "none")
# Plot escapee annotation at the bottom of this:
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)), Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) +
theme(legend.position = "None")
p1 / p2 + plot_layout(heights = c(4, 0.2))
ggsave("./FiguresIllustrator/Fig1/cl30_heatmap_ordered_only_escapees_by_position.pdf", width = 20, height = 4)
For the supplement, we look at some genes as examples:
# Subset data on relevant samples and for the main figure, only show E6
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]
ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
data_test_inactive <- counts_inactive(data_here)
data_test_active <- counts_active(data_here)
# plot_gene <- function(gene, save = F, path = NULL, suffix = "_escape_plot.pdf"){
#
# i = gene
# y = as.numeric(data_test_inactive[i, ])
# N = as.numeric(data_test_active[i, ] + data_test_inactive[i, ])
#
# pp <- data.frame(
# y = y,
# N = N,
# Condition = data_here$ConditionClean,
# Dox = data_here$ndDox,
# Washout = data_here$washout,
# Clone = paste0(data_here$Clone, " - ", data_here$Experiment)
# ) %>%
# mutate(Condition = factor(Condition, levels = c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)"))) %>%
# ggplot(aes(x = Condition, y = y / N, col = Clone)) + geom_point(size = 4) +
# ylim(0, 1) +
# geom_line(aes(group = Clone), linetype = 'dashed') + theme_paper() +
# xlab("") + ylab("ASE (d-score)") +
# #scale_x_discrete(labels = c("Dox + washout", "Dox only", "Control")) +
# NULL
# pp
#
# if (save){
# ggsave(paste0(path, "./", gene, suffix), pp)
# }
#
# return(pp)
# }
#
# plot_gene("Kdm5c", save = T, "~/Desktop/Projects/XChromosome_Antonia/Figures_new/Fig1/", "_escape_all_clones.pdf")
# plot_gene("Kdm6a", save = T, "~/Desktop/Projects/XChromosome_Antonia/Figures_new/Fig1/", "_escape_all_clones.pdf")
To make sure we observe substantial reduction in gene expression, we use DESeq2 to test for differences in escapee expression between Control and 7d. We do this either for combined, only inactive and only active counts:
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (7d)")]
data_here <- data_here[,data_here$Clone == "E6"]
data_here <- data_here[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]
data_here <- data_here[!rownames(data_here) %in% genes_out, ]
data_for_sf <- data_with_autosomes[,colnames(data_here)]
size_factors_global <- DESeq2::estimateSizeFactors(DESeqDataSetFromMatrix(counts(data_for_sf), colData = DataFrame(Cond = rep(1, 6)), design = ~0))
size_factors_x <- DESeq2::estimateSizeFactors(DESeqDataSetFromMatrix(counts_active(data_here) + counts_inactive(data_here), colData = DataFrame(Cond = rep(1, 6)), design = ~0))
deseq_inactive <- DESeqDataSetFromMatrix(countData = counts_inactive(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_inactive$sizeFactor <- size_factors_global$sizeFactor
deseq_inactive <- DESeq(deseq_inactive)
deseq_active <- DESeqDataSetFromMatrix(countData = counts_active(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_active$sizeFactor <- size_factors_global$sizeFactor
deseq_active <- DESeq(deseq_active)
deseq_total <- DESeqDataSetFromMatrix(countData = counts(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_total$sizeFactor <- size_factors_global$sizeFactor
deseq_total <- DESeq(deseq_total)
df_inactive <- results(deseq_inactive) %>%
data.frame() %>%
rownames_to_column("Gene") %>%
add_column(Status = "Inactive")
df_active <- results(deseq_active) %>%
data.frame() %>%
rownames_to_column("Gene") %>%
add_column(Status = "Active")
df_total <- results(deseq_total) %>%
data.frame() %>%
rownames_to_column("Gene") %>%
add_column(Status= "Total")
rbind(df_inactive, rbind(df_active, df_total)) %>%
ggplot(aes(x = baseMean, y = log2FoldChange, col = padj < 0.1)) + geom_point() + scale_x_log10() +
ggrepel::geom_text_repel(aes(label = Gene)) + xlab("Mean Expression (log normalized counts)") + ylab("log2 (7d dox / Control)") +
theme_paper() + facet_wrap(~Status) + scale_color_manual(values = c("black", "red"))
ggsave("./FiguresIllustrator/FigS1/escape_deseq_analysis.pdf", width = 9, height = 6)
# Visualize also as a heatmap:
df_combined <- rbind(df_inactive, rbind(df_active, df_total))
df_combined$genomic_position <- start(rowRanges(data[df_combined$Gene, ]))
df_combined %>%
ggplot(aes(x = 1, y = reorder(Gene, genomic_position), fill = log2FoldChange)) + geom_tile(col = "black") + facet_wrap(~Status) +
theme_paper() + scale_fill_gradient2() + geom_text(aes(label = ifelse(padj < 0.1, ".", "")), size = 10, nudge_y = 0.7) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
ggsave("./FiguresIllustrator/FigS1/escape_deseq_analysis_heatmap.pdf", width = 9, height = 30)
Finally, we test whether the strength of silencing is associated with the genes expression level or escape level (also stratified by escapee category to avoid simpsons paradox.)
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (7d)")]
ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
ratios <- ratios[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]
data_here <- data_here[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]
# collect expression levels and escape levels
escape_colors <- setNames(c("grey", "darkgreen", "orange"), nm = c("silenced / variable", "facultative", "constitutive"))
df_here <- data.frame(
EscapeCategory = rowData(data_here)$EscapeAnnotation,
BasalExpressionLevel = rowMeans(counts(data_here[,data_here$ConditionClean == "Control"])),
BasalInactiveExpressionLevel = rowMeans(counts_inactive(data_here[,data_here$ConditionClean == "Control"])),
BasalEscape = rowMeans(ratios[,data_here$ConditionClean == "Control"]),
EscapeDifference = rowMeans(ratios[,data_here$ConditionClean == "Dox (7d)"]) - rowMeans(ratios[,data_here$ConditionClean == "Control"])
)
df_here %>% ggplot(aes(x = BasalExpressionLevel, y = BasalEscape, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_hline(yintercept = 0.5, linetype = 'dashed', col = "grey")
df_here %>% ggplot(aes(x = BasalExpressionLevel, y = BasalEscape, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed') +
geom_hline(yintercept = 0.5, linetype = 'dashed', col = "grey")
ggsave("./FiguresIllustrator/FigS1/basal_exp_vs_basal_escape.pdf", width = 9, height = 6)
df_here %>% ggplot(aes(x = BasalExpressionLevel, y = EscapeDifference, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)
df_here %>% ggplot(aes(x = BasalExpressionLevel, y = EscapeDifference, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')
ggsave("./FiguresIllustrator/FigS1/basal_exp_vs_basal_escape_diff.pdf", width = 9, height = 6)
df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = BasalEscape, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)
df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = BasalEscape, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')
df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = EscapeDifference, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)
df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = EscapeDifference, col = EscapeCategory)) +
geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')
# There is no relationship between these quantities
Finally, export the d-score changes across genes to a csv
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]
ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
ratios %>%
t() %>% data.frame() %>%
add_column("Dox" = data_here$ndDox) %>%
add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
pivot_longer(-c(Dox, Clone)) %>%
group_by(Clone) %>%
dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
ungroup() %>%
add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
dplyr::filter(!is.na(escape_status)) %>%
dplyr::select(-c(Dox, Clone)) %>%
group_by(name, Condition) %>%
summarize(value = mean(value), escape_status = unique(escape_status)) %>%
ungroup() %>%
pivot_wider(values_from = value, names_from = Condition) %>%
mutate_at(vars(matches("Dox")), list(dif = ~ . - Control)) %>%
write.csv("./ProcessedData/d_score_differences.csv")
Finally, we fit decay rates to the 0-21d timecourse. We use two models, one with and one without offset, to distinguish genes that get fully silenced vs genes that retain residual escape.
escapees_use <- as.character(unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])))
# Get relevant data, we only use E6 here
data_here <- data[,data$Condition %in% c("Aux_0_Dox_0_WO_0_WOAuxNO", "Aux_0_Dox_3_WO_0_WOAuxNO", "Aux_0_Dox_7_WO_0_WOAuxNO",
"Aux_0_Dox_14_WO_0_WOAuxNO", "Aux_0_Dox_21_WO_0_WOAuxNO")]
data_here <- data_here[,data_here$Clone == "E6"]
# define the exponential decay functions
exp_decay <- function(x, a, k) a * exp(-k*x)
offset_exp_decay <- function(x, a, k, c) a * exp(-k*x) + c
# these functions perform the fit per gene. We stabilize the rates with a pseudocount of 1 and use non-linear regression to fit decay curves to the data.
# We restrict the parameter space to k > 0 and b > 0
fit_nls_curve_for_gene <- function(gene){
df_here <- data.frame(
active = as.numeric(counts_active(data_here)[gene, ]) + 1,
inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1,
timepoint = as.numeric(data_here$ndDox),
experiment = data_here$Experiment
) %>% mutate(total = active + inactive) %>%
mutate(rate = inactive / (active + inactive))
tryCatch({
rates.nls <- nls(rate ~ a * exp(- k * timepoint), data = df_here, algorithm = "port", start = c(a = 0.5, k = 0.1), lower = c(a = -Inf, k = 0))
return(c(coef(rates.nls), modelr::rsquare(rates.nls, data = df_here), logLik(rates.nls)))
},
error=function(cond) {
return(NA)
})
}
fit_nls_curve_for_gene_offset <- function(gene){
df_here <- data.frame(
active = as.numeric(counts_active(data_here)[gene, ]) + 1,
inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1,
timepoint = as.numeric(data_here$ndDox),
experiment = data_here$Experiment
) %>% mutate(total = active + inactive) %>%
mutate(rate = inactive / (active + inactive))
tryCatch({
rates.nls.offset <- nls(rate ~ a * exp(- k * timepoint) + b, data = df_here, algorithm = "port", start = c(a = 0.5, k = 0.1, b = 0.1), lower = c(a = -Inf, k = 0, b = 0))
return(c(coef(rates.nls.offset), modelr::rsquare(rates.nls.offset, data = df_here), logLik(rates.nls.offset)))
},
error=function(cond) {
return(NA)
})
}
# Run the fits and compute half lifes (t1/2 = log(0.5) / (-k))
general_escapees <- data_here[escapees_use, ]
all_coefs <- data.frame(do.call("rbind", lapply(rownames(general_escapees), fit_nls_curve_for_gene)))
all_coefs$gene <- rownames(general_escapees)
all_coefs$halflifes <- log(1/2) / ( - all_coefs$k )
#
all_coefs_offset <- data.frame(do.call("rbind", lapply(rownames(general_escapees), fit_nls_curve_for_gene_offset)))
all_coefs_offset$gene <- rownames(general_escapees)
all_coefs_offset$halflifes <- log(1/2) / ( - all_coefs_offset$k )
# how many convergence errors?
table(is.na(all_coefs$a), is.na(all_coefs_offset$a)) # there is a small amount of genes where the offset model doesnt converge, exclude these genes
##
## FALSE TRUE
## FALSE 147 15
# the excluded genes are mainly pathological, e.g. non-monotonous data
all_coefs_offset$category <- rowData(data_here[all_coefs$gene, ])$EscapeAnnotation
# Merge and compute bayes information critera for both models
merged_df <- data.frame(
gene = all_coefs$gene,
category = all_coefs_offset$category,
a = all_coefs$a,
k = all_coefs$k,
r2 = all_coefs$V3,
loglik = all_coefs$V4,
halflifes = all_coefs$halflifes,
a_off = all_coefs_offset$a,
k_off = all_coefs_offset$k,
b_off = all_coefs_offset$b,
r2_off = all_coefs_offset$V4,
loglik_off = all_coefs_offset$V5,
halflifes_off = all_coefs_offset$halflifes
) %>%
mutate(BIC_native = 3 * log(ncol(data_here)) - 2 * loglik) %>%
mutate(BIC_offset = 4 * log(ncol(data_here)) - 2 * loglik_off)
# These genes fail convergence - all but 5 have very low r2 in the non-offset model
merged_df %>%
dplyr::filter(is.na(a_off)) %>%
dplyr::arrange(r2)
## gene category a k r2
## 1 Drp2 facultative 0.10134444 0.00000000 -2.220446e-16
## 2 Timp1 facultative 0.23694296 0.00000000 -2.220446e-16
## 3 Cox7b silenced / variable 0.19528361 0.00000000 0.000000e+00
## 4 Apoo silenced / variable 0.15819510 0.00000000 0.000000e+00
## 5 Mid1 constitutive 0.06238980 0.00000000 0.000000e+00
## 6 Jpx constitutive 0.28544042 0.00000000 1.110223e-16
## 7 Gm14680 <NA> 0.34995245 0.01595671 1.529868e-02
## 8 Map3k15 silenced / variable 0.08368699 0.06303914 8.465674e-02
## 9 3110067C02Rik <NA> 0.23396680 0.05688354 1.229768e-01
## 10 Hdac8 silenced / variable 0.09678076 0.07189294 2.606631e-01
## 11 5530601H04Rik constitutive 0.34956170 0.06017532 4.253086e-01
## 12 Ap1s2 facultative 0.16989407 0.47278973 6.594223e-01
## 13 Car5b facultative 0.13348331 1.32179266 8.304988e-01
## 14 Ccdc120 silenced / variable 0.15108646 0.67908118 9.158316e-01
## 15 A230072C01Rik facultative 0.21724140 0.63039899 9.360657e-01
## loglik halflifes a_off k_off b_off r2_off loglik_off halflifes_off
## 1 12.3894854 Inf NA NA NA NA NA NA
## 2 1.7240939 Inf NA NA NA NA NA NA
## 3 9.6866505 Inf NA NA NA NA NA NA
## 4 9.3651397 Inf NA NA NA NA NA NA
## 5 18.1205953 Inf NA NA NA NA NA NA
## 6 0.4092039 Inf NA NA NA NA NA NA
## 7 3.6193003 43.4392210 NA NA NA NA NA NA
## 8 17.5958620 10.9955058 NA NA NA NA NA NA
## 9 9.6193858 12.1853730 NA NA NA NA NA NA
## 10 24.4879403 9.6413803 NA NA NA NA NA NA
## 11 12.9049880 11.5187960 NA NA NA NA NA NA
## 12 21.1810642 1.4660792 NA NA NA NA NA NA
## 13 27.1774098 0.5243993 NA NA NA NA NA NA
## 14 30.4540347 1.0207133 NA NA NA NA NA NA
## 15 27.1789323 1.0995373 NA NA NA NA NA NA
## BIC_native BIC_offset
## 1 -17.3242508 NA
## 2 4.0065321 NA
## 3 -11.9185811 NA
## 4 -11.2755595 NA
## 5 -28.7864707 NA
## 6 6.6363121 NA
## 7 0.2161193 NA
## 8 -27.7370041 NA
## 9 -11.7840516 NA
## 10 -41.5211607 NA
## 11 -18.3552560 NA
## 12 -34.9074085 NA
## 13 -46.9000996 NA
## 14 -53.4533495 NA
## 15 -46.9031446 NA
# Exclude 24 genes with r2 < 0.3, don't fit exponential model
merged_df %>%
dplyr::filter(!(r2 > 0.3 & r2_off > 0.3)) %>% dim()
## [1] 22 15
merged_df <- merged_df %>%
dplyr::filter(r2 > 0.3 & r2_off > 0.3)
# We now ask which genes fit the offset vs the native model better (supplement)
merged_df %>%
ggplot(aes(x = BIC_native, BIC_offset, col = category)) + geom_point() + geom_abline() +
ggrepel::geom_text_repel(aes(label = gene)) + theme_paper() + xlab("BIC exponential decay model") + ylab("BIC exponential decay model + offset") +
labs(col = "Escape category") + scale_color_manual(values = escape_colors)
ggsave("./FiguresIllustrator/FigS1/bic_model_choice.pdf", width = 9, height = 6)
merged_df %>%
ggplot(aes(x = halflifes_off, b_off, col = category, shape = BIC_native > BIC_offset)) +
geom_point(size = 3) + ylim(c(0, 0.3)) + ggrepel::geom_text_repel(aes(label = gene), size = 5) +
theme_paper() + scale_color_manual(values = escape_colors) + theme_paper() + xlab("half-life (non-linear regression) [days]") +
ylab("offset parameter (non-linear regression)") + labs(col = "Escape category", shape = "Offset model favored (BIC)") + scale_x_log10()
ggsave("./FiguresIllustrator/Fig1/half_life_vs_baseline.pdf", width = 9, height = 6)
merged_df %>%
dplyr::filter(!is.na(category)) %>%
ggplot(aes(x = category, y = halflifes_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() +
scale_fill_manual(values = escape_colors) + xlab("") + ylab("half-life (non-linear regression) [days]") + ggtitle("Escape half-life")
merged_df %>%
dplyr::filter(!is.na(category)) %>%
ggplot(aes(x = category, y = b_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() +
scale_fill_manual(values = escape_colors) + xlab("") + ylab("offset parameter (non-linear regression)") + ggtitle("Residual escape")
merged_df %>%
dplyr::filter(!is.na(category)) %>%
ggplot(aes(x = category, y = halflifes_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() +theme_paper() +
scale_fill_manual(values = escape_colors) + xlab("") + ylab("half-life (non-linear regression) [days]") + ggtitle("Escape half-life") +
ggrepel::geom_text_repel(aes(label = gene))
ggsave("./FiguresIllustrator/Fig1/half_life_category.pdf", width = 9, height = 6)
merged_df %>%
dplyr::filter(!is.na(category)) %>%
ggplot(aes(x = category, y = b_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() +
scale_fill_manual(values = escape_colors) + xlab("") + ylab("offset parameter (non-linear regression)") + ggtitle("Residual escape") +
ggrepel::geom_text_repel(aes(label = gene))
ggsave("./FiguresIllustrator/Fig1/residual_escape_category.pdf", width = 9, height = 6)
# Here we can check individual genes
plot_decay_gene <- function(gene, save_plot = F, path = "./"){
df_here <- data.frame(
active = as.numeric(counts_active(data_here)[gene, ]) + 1,
inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1,
timepoint = as.numeric(data_here$ndDox),
experiment = data_here$Experiment
) %>% mutate(total = active + inactive) %>% mutate(rate = inactive / (active + inactive))
df_native <- all_coefs %>% column_to_rownames("gene")
df_offset <- all_coefs_offset %>% column_to_rownames("gene")
data.frame(
active = as.numeric(counts_active(data_here)[gene, ]) + 1,
inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1,
timepoint = as.numeric(data_here$ndDox),
experiment = data_here$Experiment
) %>% mutate(rates = inactive / (inactive + active)) %>%
ggplot(aes(x = timepoint, y = rates)) + geom_point() + theme_paper() +
geom_function(fun = function(x, a = df_native[gene, ]$a, k = df_native[gene, ]$k) exp_decay(x, a, k),
linetype = "dashed", aes(col = "Exp. Decay Model")) + ylab("allelic ratio") +
geom_function(fun = function(x, a = df_offset[gene, ]$a, k = df_offset[gene, ]$k, c = df_offset[gene, ]$b) offset_exp_decay(x, a, k, c),
linetype = "dashed", aes(col = "Exp. Decay Model + Offset")) + ylab("allelic ratio") + labs(colour = "Model") +
ylim(c(0, 1)) + ggtitle(gene) + theme(legend.position = c(0.8, 0.9)) + scale_color_manual(values = c("red", "darkred")) +
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black")) +
annotate("text", label = paste0("R^2: ", round(all_coefs[all_coefs$gene == gene, ]$V3, digits = 4)), x = 2, y = 1, size = 8) +
annotate("text", label = paste0("t1/2: ", round(all_coefs[all_coefs$gene == gene, ]$halflifes, digits = 4)), x = 2, y = 0.9, size = 8) +
xlab("Days after after Xist induction") + ylab("d-score (B6 / (B6 + CAST)") -> p1
if (save_plot){
ggsave(paste0(path, "decay_curve_", gene, ".pdf"), p1, width = 9, height = 6)
}
return(p1)
}
plot_decay_gene("Pbdc1", save_plot = T, path = "./FiguresIllustrator/Fig1/")
plot_decay_gene("Med14", save_plot = T, path = "./FiguresIllustrator/Fig1/")
plot_decay_gene("Mecp2", save_plot = T, path = "./FiguresIllustrator/Fig1/")
plot_decay_gene("Tfe3", save_plot = T, path = "./FiguresIllustrator/Fig1/")
genes_plot <- merged_df[merged_df$category == "constitutive", ]$gene
genes_plot <- genes_plot[!is.na(genes_plot)]
for (gene in genes_plot){
plot_decay_gene(gene, save_plot = T, path = "./FiguresIllustrator/Fig1/")
}